import pymysql

# Scrapy settings for gbifSpider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "gbifSpider"

SPIDER_MODULES = ["gbifSpider.spiders"]
NEWSPIDER_MODULE = "gbifSpider.spiders"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
# Default User-Agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 2

# Randomize the download delay: if enabled, Scrapy waits a random interval
# (a random value between 0.5 and 1.5, multiplied by DOWNLOAD_DELAY) when
# fetching requests from the same website
RANDOMIZE_DOWNLOAD_DELAY = True
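# With DOWNLOAD_DELAY = 2 as above, the effective wait between requests to the
# same site therefore falls between 1.0 and 3.0 seconds.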

# The download delay setting will honor only one of:
# Maximum number of concurrent (i.e. simultaneous) requests performed to any
# single domain
# CONCURRENT_REQUESTS_PER_DOMAIN = 1
# Maximum number of concurrent (i.e. simultaneous) requests performed to any
# single IP; if non-zero, this overrides CONCURRENT_REQUESTS_PER_DOMAIN
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# Whether the cookies middleware is enabled; if disabled, no cookies are sent
# to web servers
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# Default headers used for Scrapy HTTP requests, populated by
# DefaultHeadersMiddleware; you can usually extend this with a more complete set
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# Middleware settings: keys are middleware paths, values are orders (the lower
# the value, the higher the priority)
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#    'gbifSpider.middlewares.RandomUserAgent': 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# Dict holding the downloader middlewares enabled in this project and their
# orders. DOWNLOADER_MIDDLEWARES is merged with (not overridden by) Scrapy's
# built-in DOWNLOADER_MIDDLEWARES_BASE setting and then sorted by order to get
# the final list of enabled middlewares: the first middleware is the one
# closest to the engine, the last is the one closest to the downloader.
# To decide where to place a middleware, look at the orders defined in
# DOWNLOADER_MIDDLEWARES_BASE and pick a value based on where you want yours
# to sit. Because each middleware performs a different action, your middleware
# may depend on middlewares applied before (or after) it, so order matters
#DOWNLOADER_MIDDLEWARES = {
#    'bid.middlewares.BidDownloaderMiddleware': 543,
#}
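# For example, a built-in middleware can be disabled by mapping its path to
# None while a custom one is enabled (the paths below are illustrative):
#
#   DOWNLOADER_MIDDLEWARES = {
#       'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
#       'gbifSpider.middlewares.RandomUserAgent': 400,
#   }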

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}



# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True

# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5

# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Request timeout (seconds)
DOWNLOAD_TIMEOUT = 300

# --- Logging ---
# Whether logging is enabled
# LOG_ENABLED = True
# Log level (messages at the chosen level and above are logged):
#   CRITICAL -- critical errors
#   ERROR    -- ordinary errors
#   WARNING  -- warnings
#   INFO     -- informational messages (recommended for production)
#   DEBUG    -- debug messages (recommended for development)
LOG_LEVEL = 'ERROR'

# Log encoding (default: utf-8)
# LOG_ENCODING = 'utf-8'

# Log output file (default: None)
# LOG_FILE = '....log'

# If enabled, all standard output of the process is redirected to the log
# (default: False)
# LOG_STDOUT = True


# --- Closing the spider on a condition ---
# Enable the CloseSpider extension
# EXTENSIONS = {
#     'scrapy.extensions.closespider.CloseSpider': 500,
# }
# Close after the given number of seconds
# CLOSESPIDER_TIMEOUT = 20
# Close after the given number of items have been scraped
# CLOSESPIDER_ITEMCOUNT = 1
# Close after the given number of responses have been crawled
# CLOSESPIDER_PAGECOUNT
# Close after the given number of errors have occurred
# CLOSESPIDER_ERRORCOUNT

# User-agent pool for random rotation
USER_AGENT_LIST = [
    'Mozilla/4.0 (Windows; MSIE 6.0; Windows NT 5.2)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/5.0 (compatible; WOW64; MSIE 10.0; Windows NT 6.2)',
    'Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows Phone OS 7.0; Trident/3.1; IEMobile/7.0; LG; GW910)',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; SAMSUNG; SGH-i917)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows Phone 8.0; Trident/6.0; IEMobile/10.0; ARM; Touch; NOKIA; Lumia 920)',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 6_1_4 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) CriOS/27.0.1453.10 Mobile/10B350 Safari/8536.25',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Ubuntu/11.10 Chromium/27.0.1453.93 Chrome/27.0.1453.93 Safari/537.36',
    'Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19',
    'Mozilla/5.0 (Linux; Android 4.1.2; Nexus 7 Build/JZ054K) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19',
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
    'Mozilla/5.0 (iPad; CPU OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3',
    'Mozilla/5.0 (iPad; CPU OS 8_1_3 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B466 Safari/600.1.4',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 8_0_2 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12A366 Safari/600.1.4',
    'Mozilla/5.0 (iPod; CPU iPhone OS 5_1_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9B206 Safari/7534.48.3',
    'Mozilla/5.0 (Android; Mobile; rv:14.0) Gecko/14.0 Firefox/14.0',
    'Mozilla/5.0 (Linux; Android 4.1.2; Nexus 7 Build/JZ054K) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:21.0) Gecko/20100101 Firefox/21.0',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20130331 Firefox/21.0',
    'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0',
    'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.9.168 Version/11.52',
    'Opera/9.80 (Windows NT 6.1; WOW64; U; en) Presto/2.10.229 Version/11.62'
]
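
# A minimal sketch of a middleware that could rotate through this pool. A class
# named RandomUserAgent appears (commented out) in the SPIDER_MIDDLEWARES
# example above; the body below is an assumed implementation, not this
# project's actual code. A user-agent rotator would normally be registered as a
# downloader middleware in gbifSpider/middlewares.py:
#
#   import random
#
#   class RandomUserAgent:
#       def process_request(self, request, spider):
#           # Pick a fresh UA from the pool for every outgoing request
#           request.headers['User-Agent'] = random.choice(
#               spider.settings['USER_AGENT_LIST'])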

# Proxy pool (placeholders; replace ip:port and user:password with real values)
PROXY_LIST = [
    {'ip_port': 'ip:port', 'user_password': 'user:password'},
    {'ip_port': 'ip:port'}
]
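
# A minimal sketch of a random-proxy downloader middleware that could consume
# this list (the RandomProxy path in the commented-out settings below is the
# only reference; the implementation here is an assumption):
#
#   import base64
#   import random
#
#   class RandomProxy:
#       def process_request(self, request, spider):
#           proxy = random.choice(spider.settings['PROXY_LIST'])
#           request.meta['proxy'] = 'http://' + proxy['ip_port']
#           if 'user_password' in proxy:
#               # HTTP Basic auth for the proxy: base64 of "user:password"
#               creds = base64.b64encode(
#                   proxy['user_password'].encode()).decode()
#               request.headers['Proxy-Authorization'] = 'Basic ' + creds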


# Enable downloader middlewares
DOWNLOADER_MIDDLEWARES = {
   'gbifSpider.middlewares.RedirectMiddleware': 541,
   # 'bid.middlewares.RandomProxy': 542,
   # 'bid.middlewares.SeleniumMiddleware': 543,
}

# MySQL connection settings (keyword arguments for pymysql.connect())
DB_SETTINGS = {
    'gbif': {
        'host': '127.0.0.1',
        'db': 'spider',
        'user': 'root',
        'password': '123456',
        'port': 3306,
        'cursorclass': pymysql.cursors.DictCursor,  # return result rows as dicts
    },
}
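# A pipeline can then open a connection with, e.g.:
#   conn = pymysql.connect(**DB_SETTINGS['gbif'])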

# MySQL settings (connection parameters plus the target table)
MySQInit = {
    'host': '127.0.0.1',
    'port': 3306,
    'user': 'root',
    'password': '123456',
    'db': 'spider',
    'charset': 'utf8',
    'table': 'gbif'
}
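
# A minimal sketch of the MysqlPipeline referenced (commented out) in
# ITEM_PIPELINES below; this is an assumed implementation, not the project's
# confirmed code. Note that the non-connection key 'table' must be popped
# before the dict is passed to pymysql.connect():
#
#   class MysqlPipeline:
#       def open_spider(self, spider):
#           cfg = dict(spider.settings['MySQInit'])
#           self.table = cfg.pop('table')
#           self.conn = pymysql.connect(**cfg)
#
#       def process_item(self, item, spider):
#           data = dict(item)
#           cols = ', '.join(data)
#           marks = ', '.join(['%s'] * len(data))
#           with self.conn.cursor() as cur:
#               cur.execute(
#                   f'INSERT INTO {self.table} ({cols}) VALUES ({marks})',
#                   list(data.values()))
#           self.conn.commit()
#           return item
#
#       def close_spider(self, spider):
#           self.conn.close()
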
# Elasticsearch settings
ElasticsearchInit = {
    'host': ['http://127.0.0.1:9200'],  # ES cluster host list; may be left empty for the default local node
    'http_auth': (),  # put ('user', 'passwd') in the tuple if authentication is required
    'index': 'bid'
}
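# Sketch of how a pipeline might use these settings with the elasticsearch-py
# client (7.x-style API; assumed, not confirmed by this project):
#
#   from elasticsearch import Elasticsearch
#
#   es = Elasticsearch(ElasticsearchInit['host'],
#                      http_auth=ElasticsearchInit['http_auth'] or None)
#   es.index(index=ElasticsearchInit['index'], body=dict(item))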
# MongoDB settings (placeholders; fill in the real host, port, db and collection)
MongoDBInit = {
    'host': 'host',
    'port': 'port',
    # 'auth': {
    #     'username': '',
    #     'password': '',
    # },
    'db': 'db',
    'col': 'col'
}
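# Sketch of the corresponding pymongo usage once real values are filled in
# (assumed, not confirmed by this project):
#
#   from pymongo import MongoClient
#
#   client = MongoClient(MongoDBInit['host'], MongoDBInit['port'])
#   client[MongoDBInit['db']][MongoDBInit['col']].insert_one(dict(item))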
# Redis host
RedisInit = '127.0.0.1'
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Enable item pipelines (lower order values run first)
ITEM_PIPELINES = {
   'gbifSpider.pipelines.GbifSpiderPipeline': 300,
   'gbifSpider.pipelines.GbifImagesPipeline': 301,
   # 'bid.pipelines.MysqlPipeline': 300,
   # 'gbifSpider.pipelines.ElasticsearchPipeline': 301,
   # 'bid.pipelines.MongoDBPipelinePipeline': 301,
}

# Let 301/302 redirect responses reach the spider callbacks instead of being
# filtered out as errors
HTTPERROR_ALLOWED_CODES = [301, 302]

# Directory where the images pipeline stores downloaded files
IMAGES_STORE = './images'

# Module holding this project's custom Scrapy commands
COMMANDS_MODULE = 'gbifSpider.commands'